iT邦幫忙

2023 iThome 鐵人賽

DAY 22
0
Software Development

開心撰寫 PHPUnit系列 第 22

Day 22. 重構分頁 - 想知道分頁資訊怎麼辦

  • 分享至 

  • xImage
  •  

在上一篇我們最終的程式碼

<?php

namespace Recca0120\Ithome30\Crawlers;

use GuzzleHttp\Psr7\Request;
use Psr\Http\Client\ClientInterface;

/**
 * Crawls the article list of a PTT board, walking backwards through the
 * "previous page" links.
 */
class Board
{
    public function __construct(private ClientInterface $httpClient)
    {
    }

    /**
     * Fetches one or more list pages of a board.
     *
     * @param array    $board Board descriptor; the keys 'url', 'name' and 'class' are read.
     * @param int|null $take  Maximum number of pages to fetch; null follows the
     *                        "previous page" link until there is none.
     *
     * @return array One entry per fetched page, each entry being the list of
     *               parsed rows (see parseCols()) of that page.
     */
    public function fetch(array $board, ?int $take = null): array
    {
        $url = $board['url'];

        $results = [];
        $page = 0;
        do {
            $page++;
            $html = $this->sendRequest($url);
            $results[] = array_map(
                fn (string $row) => $this->parseCols($row, $board),
                $this->parseRows($html)
            );
            if ($take !== null && $page >= $take) {
                break;
            }
            // The 'prev' key is absent when the page carries no pagination
            // buttons at all; treat that the same as "no previous page".
            $url = $this->parsePagination($html)['prev'] ?? null;
        } while ($url !== null);

        return $results;
    }

    /**
     * Sends a GET request with browser-like headers and returns the response body.
     *
     * @param string $url Absolute URL of the page to download.
     *
     * @return string The response body cast to a string.
     */
    private function sendRequest($url)
    {
        $request = new Request('GET', $url, [
            'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding' => 'gzip, deflate, br',
            'Accept-Language' => 'zh-TW,zh;q=0.8',
            'Cache-Control' => 'max-age=0',
            'Cookie' => 'over18=1',
            'Referer' => 'https://www.ptt.cc/bbs/Gossiping/index.html',
            'Sec-Ch-Ua' => '"Brave";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
            'Sec-Ch-Ua-Mobile' => '?0',
            'Sec-Ch-Ua-Platform' => '"macOS"',
            'Sec-Fetch-Dest' => 'document',
            'Sec-Fetch-Mode' => 'navigate',
            'Sec-Fetch-Site' => 'same-origin',
            'Sec-Fetch-User' => '?1',
            'Sec-Gpc' => '1',
            'Upgrade-Insecure-Requests' => '1',
            'User-Agent' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
        ]);

        return (string) $this->httpClient->sendRequest($request)->getBody();
    }

    /**
     * Extracts the navigation buttons of a list page.
     *
     * @param string $html HTML of a list page.
     *
     * @return array Map with the keys 'oldest', 'prev', 'next', 'latest' for every
     *               button found; the value is an absolute URL, or null when the
     *               button has no href.
     */
    private function parsePagination($html)
    {
        preg_match_all('/<a class="btn wide( disabled)?"( href="(?<href>.+)")?>.*(?<name>(最舊|上頁|下頁|最新))[^<]*?<\/a>/', $html, $matches);
        $lookup = ['最舊' => 'oldest', '上頁' => 'prev', '下頁' => 'next', '最新' => 'latest'];
        $pagination = [];
        foreach (array_keys($matches[0]) as $index) {
            $pagination[$lookup[$matches['name'][$index]]] = $matches['href'][$index]
                ? 'https://www.ptt.cc' . $matches['href'][$index]
                : null;
        }

        return $pagination;
    }

    /**
     * Turns the HTML of one list row into an associative array.
     *
     * @param string $row   HTML fragment of a single row, as returned by parseRows().
     * @param array  $board Board descriptor; the keys 'name' and 'class' are copied into the result.
     *
     * @return array Keys: board_name, board_class, the matched nrec/title/author/date
     *               columns, url and type. 'url' is null when the title holds no
     *               link; 'type' is '' and 'title' is the plain title text when
     *               the title does not follow the "[type] title" pattern.
     */
    private function parseCols($row, $board)
    {
        preg_match_all('/<div class="(?<name>(nrec|title|author|date))"[^>]*>(?<value>.*?)<\/div>/s', $row, $matches);

        $cols = [
            'board_name' => $board['name'],
            'board_class' => $board['class'],
        ];

        foreach (array_keys($matches[0]) as $index) {
            $cols[$matches['name'][$index]] = trim($matches['value'][$index]);
        }
        $cols['nrec'] = strip_tags($cols['nrec'] ?? '');

        $titleHtml = $cols['title'] ?? '';

        // Rows without a link (e.g. a deleted post) must not yield the bare
        // site root as their URL.
        $cols['url'] = preg_match('/href="(.*)"/', $titleHtml, $matched) === 1
            ? 'https://www.ptt.cc' . $matched[1]
            : null;

        $titleText = strip_tags($titleHtml);
        if (preg_match('/\[(.+)\](.+)/', $titleText, $matched) === 1) {
            $cols['type'] = trim($matched[1]);
            $cols['title'] = trim($matched[2]);
        } else {
            // No "[type] title" pattern: keep the text instead of dropping it.
            $cols['type'] = '';
            $cols['title'] = trim($titleText);
        }

        return $cols;
    }

    /**
     * Splits a list page into the HTML fragments of its rows.
     *
     * @param string $html HTML of a list page.
     *
     * @return array List of fragments, each starting at class="r-ent"> and
     *               ending after the row's "mark" div.
     */
    private function parseRows($html)
    {
        preg_match_all('/class="r-ent">.+<div class="mark">(.+)<\/div>/sU', $html, $matches);

        return $matches[0];
    }
}

我們的回傳是 array,但我想知道分頁的資訊這時候可以怎麼做呢?(在修改前一樣先執行一次測試)

因為 array 是沒辦法夾帶額外資訊的,分頁我們改為回傳 Paginator 的物件即可,但回傳物件是不是測試要再做調整?

這時我們就可以使用 PHP 內建的兩個 interface:ArrayAccess 與 Countable

就可以讓物件可以像 array 一樣操作,並能計算數量,最終我們可以實作出 Paginator 的程式碼

<?php

namespace Recca0120\Ithome30;

use ArrayAccess;
use Countable;

/**
 * One page of crawled rows together with the navigation links parsed from
 * that page's HTML.
 *
 * Implements ArrayAccess and Countable so the object can be indexed and
 * counted like the plain array of rows it replaces.
 */
class Paginator implements ArrayAccess, Countable
{
    /**
     * Navigation links keyed by 'oldest', 'prev', 'next' and 'latest'. A value
     * is an absolute URL, or null when the button has no href. Keys whose
     * button was not found in the HTML are absent.
     *
     * @var array<string, string|null>
     */
    public $meta = [];

    /**
     * @param string $html        HTML of the list page the rows were parsed from.
     * @param array  $items       Rows of this page.
     * @param int    $currentPage Page number assigned by the caller.
     */
    public function __construct(string $html, public array $items, public int $currentPage)
    {
        // Pagination parsing moved here unchanged from the crawler.
        preg_match_all('/<a class="btn wide( disabled)?"( href="(?<href>.+)")?>.*(?<name>(最舊|上頁|下頁|最新))[^<]*?<\/a>/', $html, $matches);
        $lookup = ['最舊' => 'oldest', '上頁' => 'prev', '下頁' => 'next', '最新' => 'latest'];
        foreach (array_keys($matches[0]) as $index) {
            $this->meta[$lookup[$matches['name'][$index]]] = $matches['href'][$index]
                ? 'https://www.ptt.cc' . $matches['href'][$index]
                : null;
        }
    }

    /**
     * Returns the rows of this page as a plain array.
     */
    public function items(): array
    {
        return $this->items;
    }

    /**
     * Tells whether a "previous page" link was found on this page.
     *
     * Returns false both when the button has no href and when no pagination
     * buttons were found at all.
     */
    public function hasMorePage(): bool
    {
        return isset($this->meta['prev']);
    }

    public function offsetExists($key): bool
    {
        return isset($this->items[$key]);
    }

    public function offsetGet($key): mixed
    {
        return $this->items[$key];
    }

    /**
     * Stores a row; `$paginator[] = $row` appends like a native array.
     */
    public function offsetSet($key, $value): void
    {
        // PHP passes null as the offset for the `[]` append syntax; writing to
        // $this->items[null] would store the value under the key "" instead.
        if ($key === null) {
            $this->items[] = $value;

            return;
        }

        $this->items[$key] = $value;
    }

    public function offsetUnset($key): void
    {
        unset($this->items[$key]);
    }

    public function count(): int
    {
        return count($this->items);
    }
}

我們的 Board.php 程式碼則為

<?php

namespace Recca0120\Ithome30\Crawlers;

use GuzzleHttp\Psr7\Request;
use Recca0120\Ithome30\Paginator;
use Psr\Http\Client\ClientInterface;

/**
 * Crawls the article list of a PTT board and returns every fetched page as a
 * Paginator, walking backwards through the "previous page" links.
 */
class Board
{
    public function __construct(private ClientInterface $httpClient)
    {
    }

    /**
     * Fetches one or more list pages of a board.
     *
     * @param array    $board Board descriptor; the keys 'url', 'name' and 'class' are read.
     * @param int|null $take  Maximum number of pages to fetch; null keeps going
     *                        while the paginator reports a previous page.
     *
     * @return array One Paginator per fetched page, in fetch order.
     */
    public function fetch(array $board, ?int $take = null): array
    {
        $url = $board['url'];

        $results = [];
        $page = 0;
        do {
            $page++;
            $html = $this->sendRequest($url);
            $rows = array_map(
                fn (string $row) => $this->parseCols($row, $board),
                $this->parseRows($html)
            );
            $paginator = new Paginator($html, $rows, $page);
            $results[] = $paginator;
            if ($take !== null && $paginator->currentPage >= $take) {
                break;
            }
            // The 'prev' key is absent when the page carries no pagination
            // buttons at all; reading it unguarded would raise a warning.
            $url = $paginator->meta['prev'] ?? null;
        } while ($paginator->hasMorePage());

        return $results;
    }

    /**
     * Sends a GET request with browser-like headers and returns the response body.
     *
     * @param string $url Absolute URL of the page to download.
     *
     * @return string The response body cast to a string.
     */
    private function sendRequest($url)
    {
        $request = new Request('GET', $url, [
            'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding' => 'gzip, deflate, br',
            'Accept-Language' => 'zh-TW,zh;q=0.8',
            'Cache-Control' => 'max-age=0',
            'Cookie' => 'over18=1',
            'Referer' => 'https://www.ptt.cc/bbs/Gossiping/index.html',
            'Sec-Ch-Ua' => '"Brave";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
            'Sec-Ch-Ua-Mobile' => '?0',
            'Sec-Ch-Ua-Platform' => '"macOS"',
            'Sec-Fetch-Dest' => 'document',
            'Sec-Fetch-Mode' => 'navigate',
            'Sec-Fetch-Site' => 'same-origin',
            'Sec-Fetch-User' => '?1',
            'Sec-Gpc' => '1',
            'Upgrade-Insecure-Requests' => '1',
            'User-Agent' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
        ]);

        return (string) $this->httpClient->sendRequest($request)->getBody();
    }

    /**
     * Turns the HTML of one list row into an associative array.
     *
     * @param string $row   HTML fragment of a single row, as returned by parseRows().
     * @param array  $board Board descriptor; the keys 'name' and 'class' are copied into the result.
     *
     * @return array Keys: board_name, board_class, the matched nrec/title/author/date
     *               columns, url and type. 'url' is null when the title holds no
     *               link; 'type' is '' and 'title' is the plain title text when
     *               the title does not follow the "[type] title" pattern.
     */
    private function parseCols($row, $board)
    {
        preg_match_all('/<div class="(?<name>(nrec|title|author|date))"[^>]*>(?<value>.*?)<\/div>/s', $row, $matches);

        $cols = [
            'board_name' => $board['name'],
            'board_class' => $board['class'],
        ];

        foreach (array_keys($matches[0]) as $index) {
            $cols[$matches['name'][$index]] = trim($matches['value'][$index]);
        }
        $cols['nrec'] = strip_tags($cols['nrec'] ?? '');

        $titleHtml = $cols['title'] ?? '';

        // Rows without a link (e.g. a deleted post) must not yield the bare
        // site root as their URL.
        $cols['url'] = preg_match('/href="(.*)"/', $titleHtml, $matched) === 1
            ? 'https://www.ptt.cc' . $matched[1]
            : null;

        $titleText = strip_tags($titleHtml);
        if (preg_match('/\[(.+)\](.+)/', $titleText, $matched) === 1) {
            $cols['type'] = trim($matched[1]);
            $cols['title'] = trim($matched[2]);
        } else {
            // No "[type] title" pattern: keep the text instead of dropping it.
            $cols['type'] = '';
            $cols['title'] = trim($titleText);
        }

        return $cols;
    }

    /**
     * Splits a list page into the HTML fragments of its rows.
     *
     * @param string $html HTML of a list page.
     *
     * @return array List of fragments, each starting at class="r-ent"> and
     *               ending after the row's "mark" div.
     */
    private function parseRows($html)
    {
        preg_match_all('/class="r-ent">.+<div class="mark">(.+)<\/div>/sU', $html, $matches);

        return $matches[0];
    }
}

這時再執行測試時依然會是綠燈,所以重構的過程,我們可以使用 PHP 內建的功能來進行重構,讓原功能保持一致,但同時又擴充新功能。在修改完程式碼後,我們可以立刻使用 PHPUnit 來確認程式修改無誤。


上一篇
Day 21. 爬蟲文章列表文頁 - 測試案例是可以調整的
下一篇
Day 23. 使用 Generator 重構分頁 - 更快速得到回應
系列文
開心撰寫 PHPUnit30
圖片
  直播研討會
圖片
{{ item.channelVendor }} {{ item.webinarstarted }} |
{{ formatDate(item.duration) }}
直播中

尚未有邦友留言

立即登入留言